# Load the instacart dataset into the session and preview its first ten rows.
data(instacart)
head(instacart, n = 10)
## # A tibble: 10 x 15
##    order_id product_id add_to_cart_ord~ reordered user_id eval_set order_number
##       <int>      <int>            <int>     <int>   <int> <chr>           <int>
##  1        1      49302                1         1  112108 train               4
##  2        1      11109                2         1  112108 train               4
##  3        1      10246                3         0  112108 train               4
##  4        1      49683                4         0  112108 train               4
##  5        1      43633                5         1  112108 train               4
##  6        1      13176                6         0  112108 train               4
##  7        1      47209                7         0  112108 train               4
##  8        1      22035                8         1  112108 train               4
##  9       36      39612                1         0   79431 train              23
## 10       36      19660                2         1   79431 train              23
## # ... with 8 more variables: order_dow <int>, order_hour_of_day <int>,
## #   days_since_prior_order <int>, product_name <chr>, aisle_id <int>,
## #   department_id <int>, aisle <chr>, department <chr>

Line Plot:

continuous x and y

We want to know whether, as the number of reorders increases, the number of days before a reorder becomes more consistent (one possible reason is that customers have learned how long it takes to consume the previously ordered items). We will use orders of "Large Alfresco Eggs" to make the plot, because in general the rate at which eggs are consumed is fairly constant over long periods, and "Large Alfresco Eggs" is the best seller in the "eggs" aisle.

#find out the best seller in "eggs"

# Rank the products in the "eggs" aisle by how many rows of instacart they
# appear in, largest count first, so the best seller is the first row.
# `sort = TRUE` replaces the original bare `arrange()`, which had no sort keys
# and therefore left the rows in alphabetical (group) order.
instacart %>%
  filter(aisle == "eggs") %>%
  count(product_name, sort = TRUE)
## # A tibble: 120 x 2
## # Groups:   product_name [120]
##    product_name                          n
##    <chr>                             <int>
##  1 100% Egg Whites                      10
##  2 100% Liquid Egg Whites               77
##  3 All Natural 100% Egg Whites           2
##  4 All Whites 100% Egg Whites          176
##  5 All Whites 100% Liquid Egg Whites    94
##  6 Brown Eggs                          160
##  7 Brown Extra Large Grade AA Eggs      81
##  8 Brown Fertile Jumbo Grade AA Eggs    16
##  9 Brown Fertile Large Grade AA Eggs   187
## 10 Brown Large Eggs Grade A              9
## # ... with 110 more rows
# Scatter plot of days since the prior order against order number for
# "Large Alfresco Eggs", keeping only third-or-later orders whose gap since
# the previous order is between 1 and 29 days.
instacart %>%
  filter(
    product_name == "Large Alfresco Eggs",
    order_number >= 3,
    days_since_prior_order %in% 1:29
  ) %>%
  # Keep just the two plotted columns, converted to doubles.
  transmute(
    order_number = as.numeric(order_number),
    days_since_prior_order = as.numeric(days_since_prior_order)
  ) %>%
  arrange(days_since_prior_order) %>%
  plot_ly(
    x = ~order_number,
    y = ~days_since_prior_order,
    type = "scatter",
    mode = "markers"
  )
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

Box Plot:

continuous y and categorical x

Bar Plot:

count y and categorical x

We want to take a look at the distribution of the total number of orders from the "spirits" aisle across the 24 hours of the day (there are no orders at 3 am or 4 am).

# Bar chart of the number of instacart rows from the "spirits" aisle for each
# hour of the day.
instacart %>%
  filter(aisle == "spirits") %>%
  # Count by hour (and aisle) in one step instead of group_by() + count(), so
  # the result is ungrouped and the hour column can be recoded below without
  # modifying a grouping variable.
  count(order_hour_of_day, aisle) %>%
  mutate(
    # Recode the hour as a factor so it is plotted as a categorical axis and
    # colour (presumably the intent, given the "categorical x" heading).
    order_hour_of_day = as_factor(order_hour_of_day),
    # Hover text shown for each bar.
    text_label = str_c("Order Number: ", n)
  ) %>%
  plot_ly(
    x = ~order_hour_of_day, y = ~n, color = ~order_hour_of_day,
    type = "bar", colors = "viridis", text = ~text_label, alpha = 0.8
  )